Sidebar for jupyter notebook available at: https://github.com/shoval/jupyter-navbar
# Sandbox calcs from lecture!
# import numpy as np
# k = np.array([[37.1,-6.55],
# [-6.55,43.99]])
# eigenvalues, eigenvectors = np.linalg.eig(k)
# print (eigenvectors, eigenvalues)
Here we are going to start with the iris dataset. This is a very simple dataset that is four dimensions. Details about this data are rampant on the web.
Let's get started. We will use scikit for this analysis, which has a built-in loading mechanism for the iris dataset.
from sklearn import datasets as ds
iris = ds.load_iris() # wow that was simple to load up!
print (iris['DESCR']) # a description of the dataset
# let's convert it to a pandas data frame and visualize it
import pandas as pd
import numpy as np
df = pd.DataFrame(iris.data,columns=iris.feature_names)
# add in the class targets and names
df['target'] = iris.target.astype(np.int)
print (df.info())
df.head()
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
sns.pairplot(df, hue="target", size=2)
Now let's use PCA to find the two "best" dimensions of this data. These are linear transforms that help project the features into something more understandable.
from sklearn.decomposition import PCA
X = iris.data
y = iris.target
target_names = iris.target_names
# project the four iris features onto the two directions of maximum variance
pca = PCA(n_components=2)
X_pca = pca.fit(X).transform(X) # fit data and then transform it
# print the components
# each row of components_ is one principal direction (a unit vector in feature space)
print ('pca:', pca.components_)
import seaborn as sns
# NOTE(review): sns.set() always returns None, so cmap is None here; the later
# scatter call therefore falls back to matplotlib's default colormap -- confirm
# whether an actual colormap object was intended
cmap = sns.set(style="darkgrid")
# this function definition just formats the weights into readable strings
# you can skip it without loss of generality to the Data Science content
def get_feature_names_from_weights(weights, names, strip_chars=5):
    """Format component weight vectors into human-readable label strings.

    Parameters
    ----------
    weights : 2-D array-like
        One row of feature weights per component (e.g. ``pca.components_``).
    names : sequence of str
        Feature names, one per column of ``weights``.
    strip_chars : int, optional
        Number of trailing characters to drop from each feature name.
        The default of 5 strips the " (cm)" unit suffix of the iris
        feature names; pass 0 to keep names unchanged.

    Returns
    -------
    list of str
        One string per component, e.g. ``'0.36*sepal length -0.08*sepal width '``.
    """
    labels = []
    for comp in weights:
        term = ''
        for fidx, fname in enumerate(names):
            # write an explicit '+' between terms when the weight is non-negative
            if fidx > 0 and comp[fidx] >= 0:
                term += '+'
            short = fname[:-strip_chars] if strip_chars else fname
            term += '%.2f*%s ' % (comp[fidx], short)
        labels.append(term)
    return labels
plt.style.use('ggplot')
# now let's get to the Data Analytics!
pca_weight_strings = get_feature_names_from_weights(pca.components_, iris.feature_names)
# create some pandas dataframes from the transformed outputs
# BUG FIX: the original passed columns=[pca_weight_strings] -- a list wrapped in
# a list -- which pandas interprets as a MultiIndex of 1-tuples. Pass the list
# of label strings directly as the column index.
df_pca = pd.DataFrame(X_pca, columns=pca_weight_strings)
from matplotlib.pyplot import scatter
# scatter plot the output, with the names created from the weights
# (color by class; marker size grows with the class index so overlapping groups stay visible)
ax = scatter(X_pca[:,0], X_pca[:,1], c=y, s=(y+2)*10, cmap=cmap)
plt.xlabel(pca_weight_strings[0])
plt.ylabel(pca_weight_strings[1])
Biplots are visualizations that allow you to see how much of each principal component is represented by each feature. They are often used with survey data to display the responses to multiple survey questions. An example biplot used in this fashion is available here: https://datasciencebe.files.wordpress.com/2015/03/pca-skillscores.png

# Manipulated example from https://github.com/teddyroland/python-biplot/blob/master/biplot.py
def biplot(pca, dat, title=''):
    """Plotly biplot: PCA-projected samples plus one loading vector per feature.

    Uses the first two components of the fitted `pca`; the columns of
    DataFrame `dat` supply the feature labels.
    """
    import plotly
    from plotly.graph_objs import Scatter, Marker, Layout, XAxis, YAxis, Bar, Line
    plotly.offline.init_notebook_mode() # run at the start of every notebook
    # loadings for PC1 and PC2; change the indices to plot other PCs
    load_x = pca.components_[0]
    load_y = pca.components_[1]
    scores = pca.transform(dat.values)
    pc1 = scores[:, 0]
    pc2 = scores[:, 1]
    # start with the scatter of transformed samples, colored by class
    traces = [Scatter(x=pc1, y=pc2, mode='markers',
                      marker=Marker(color=iris.target),
                      name='PCA Trans. Data')]
    # add one line from the origin per feature, scaled to the score range
    for idx, col_name in enumerate(dat.columns.values):
        traces.append(
            Scatter(
                x=[0, load_x[idx] * max(pc1)],
                y=[0, load_y[idx] * max(pc2)],
                mode='lines+text',
                text=['', col_name],
                name=col_name,
            ))
    plotly.offline.iplot({
        "data": traces,
        "layout": Layout(xaxis=XAxis(title='Principal Component One'),
                         yaxis=YAxis(title='Principal Component Two'),
                         title=title)
    })
    plt.show()
# refit PCA keeping all four components so the biplot reflects the full decomposition
X = iris.data
pca = PCA(n_components=4)
pca.fit(X)
biplot(pca,pd.DataFrame(iris.data,columns=iris.feature_names),'Iris Biplot')
Recall that the explained variance is a factor of the eigen values. We can access these via the following equation:
$$ r_q=\frac{\sum_{j=1}^q \lambda_j}{\sum_{j=1}^p \lambda_j} $$
# manipulated from Sebastian Raschka Example (your book!)
# also from his blog here: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html
def plot_explained_variance(pca):
    """Bar chart of per-component explained variance plus its cumulative curve."""
    import plotly
    from plotly.graph_objs import Scatter, Marker, Layout, XAxis, YAxis, Bar, Line
    plotly.offline.init_notebook_mode() # run at the start of every notebook
    per_component = pca.explained_variance_ratio_
    # running total shows how many components are needed for a target variance
    cumulative = np.cumsum(per_component)
    plotly.offline.iplot({
        "data": [Bar(y=per_component, name='individual explained variance'),
                 Scatter(y=cumulative, name='cumulative explained variance')],
        "layout": Layout(xaxis=XAxis(title='Principal components'),
                         yaxis=YAxis(title='Explained variance ratio'))
    })
pca = PCA(n_components=4)
# BUG FIX: PCA.fit() returns the fitted estimator, not transformed data, so the
# old `X_pca = pca.fit(X)` bound a misleading name. Just fit here; call
# pca.transform(X) wherever projected data is actually needed.
pca.fit(X)
plot_explained_variance(pca)
This is code manipulated from Olivier Grisel's eigen face classification demonstration. You can find the original notebook here: http://nbviewer.ipython.org/github/ogrisel/notebooks/blob/master/Labeled%20Faces%20in%20the%20Wild%20recognition.ipynb
# fetch the images for the dataset
# this will take a long time the first run because it needs to download
# after the first time, the dataset will be saved to your disk (in sklearn package somewhere)
# if this does not run, you may need additional libraries installed on your system (install at your own risk!!)
from sklearn.datasets import fetch_lfw_people
# resize=None keeps the original 125x94 pixel images; only identities with 20+ photos are kept
lfw_people = fetch_lfw_people(min_faces_per_person=20, resize=None)
# get some of the specifics of the dataset
X = lfw_people.data      # one flattened image per row
y = lfw_people.target    # integer identity label per image
names = lfw_people.target_names
n_samples, n_features = X.shape
_, h, w = lfw_people.images.shape   # image height and width in pixels
n_classes = len(names)
# sanity check: count NaN/inf pixel values (expected to print 0)
print(np.sum(~np.isfinite(X)))
print("n_samples: {}".format(n_samples))
print("n_features: {}".format(n_features))
print("n_classes: {}".format(n_classes))
print("Original Image Sizes {} by {}".format(h,w))
print (125*94) # the size of the images are the size of the feature vectors
So basically each feature vector is a giant image with the rows of the image just stacked one after the other into a giant vector. The image sizes are 125 pixels by 94 pixels. This gives us 125x94=11750 pixels per image.
So we are using each pixel location in the image as a separate feature. Wow. That's a lot of features. The process is as follows:

# a helper plotting function
def plot_gallery(images, titles, h, w, n_row=3, n_col=6):
    """Draw the first n_row*n_col images as an h-by-w grayscale portrait grid."""
    plt.figure(figsize=(1.7 * n_col, 2.3 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    total = n_row * n_col
    for slot in range(total):
        ax = plt.subplot(n_row, n_col, slot + 1)
        ax.imshow(images[slot].reshape((h, w)), cmap=plt.cm.gray)
        ax.set_title(titles[slot], size=12)
        # hide the pixel-coordinate ticks
        ax.set_xticks(())
        ax.set_yticks(())
plot_gallery(X, names[y], h, w) # defaults to showing a 3 by 6 subset of the faces
# lets do some PCA of the features and go from 11750 pixel features down to 300 components
# (the original comment said "1850 features to 20 features"; the code below uses n_components = 300
# and these unresized images have 125*94 = 11750 pixels)
from sklearn.decomposition import PCA
n_components = 300
print ("Extracting the top %d eigenfaces from %d faces" % (
n_components, X.shape[0]))
pca = PCA(n_components=n_components)
%time pca.fit(X.copy())
# each principal component is itself an h-by-w image: an "eigenface"
eigenfaces = pca.components_.reshape((n_components, h, w))
plot_explained_variance(pca)
eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)
def reconstruct_image(trans_obj, org_features):
    """Project features through a fitted transform and invert back.

    Returns a pair: (low-dimensional representation, reconstruction).
    """
    compressed = trans_obj.transform(org_features)
    restored = trans_obj.inverse_transform(compressed)
    return compressed, restored
# reconstruct one face from its 300-component PCA representation and compare visually
idx_to_reconstruct = 4
X_idx = X[idx_to_reconstruct]
low_dimensional_representation, reconstructed_image = reconstruct_image(pca,X_idx.reshape(1, -1))
plt.subplot(1,2,1)
plt.imshow(X_idx.reshape((h, w)), cmap=plt.cm.gray)
plt.title('Original')
plt.grid()
plt.subplot(1,2,2)
plt.imshow(reconstructed_image.reshape((h, w)), cmap=plt.cm.gray)
plt.title('Reconstructed from Full PCA')
plt.grid()
Of course, the methods that are used to calculate eigenvectors (the components) do not scale well with the number of features (in this case, the image dimensions). Recall that PCA works on the covariance matrix and the covariance matrix size is not affected by the number of examples, only the number of features. Randomized PCA helps to mitigate this by formulating the eigenvectors of a lower rank matrix using randomized projections.
More information here:
# lets do some PCA of the features, reducing the h*w pixel features to 300 components
# (the original comment said 1850 features, but these unresized images have 125*94 = 11750)
n_components = 300
print ("Extracting the top %d eigenfaces from %d faces" % (
n_components, X.shape[0]))
# randomized SVD solver: approximates the top components much faster on wide matrices
rpca = PCA(n_components=n_components,svd_solver='randomized')
%time rpca.fit(X.copy())
eigenfaces = rpca.components_.reshape((n_components, h, w))
eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)
Warning: the block of code below takes a long time to run. Kernel methods typically take some time, so I would advise subsampling the data before using the kernel.
%%time
from sklearn.decomposition import KernelPCA
n_components = 300
print ("Extracting the top %d eigenfaces from %d faces, not calculating inverse transform" % (n_components, X.shape[0]))
# RBF-kernel PCA; fit_inverse_transform=False skips learning the expensive pre-image map
kpca = KernelPCA(n_components=n_components, kernel='rbf',
fit_inverse_transform=False, gamma=12, # very sensitive to the gamma parameter,
remove_zero_eig=True)
kpca.fit(X.copy())
%%time
# THIS TAKES A LONG TIME TO RUN
from sklearn.decomposition import KernelPCA
n_components = 300
print ("Extracting the top %d eigenfaces from %d faces, ALSO getting inverse transform" % (n_components, X.shape[0]))
kpca = KernelPCA(n_components=n_components, kernel='rbf',
fit_inverse_transform=True, gamma=12, # very sensitive to the gamma parameter,
remove_zero_eig=True)
kpca.fit(X.copy())
# the above operation takes a long time to save the inverse transform parameters
# so let's save out the results to load in later!
import pickle
pickle.dump(kpca, open( 'large_data/kpca.p', 'wb' ))
# Load the Kpca
import pickle
# BUG FIX: close the file deterministically instead of leaking the bare open() handle
with open('large_data/kpca.p', 'rb') as f:
    kpca = pickle.load(f)
# widgets example
from ipywidgets import widgets # make this interactive!
widgets.interact(lambda x: print(x),x=(0,5,1))
import warnings
# warnings.simplefilter('ignore', DeprecationWarning)
# warnings.simplefilter("always",DeprecationWarning)
def plt_reconstruct(idx_to_reconstruct):
    """Show face `idx_to_reconstruct` beside its full-, randomized-, and kernel-PCA reconstructions.

    Relies on the notebook globals X, y, names, h, w, pca, rpca and kpca.
    """
    # BUG FIX: np.round returns a float (the widget may pass a float too), and
    # indexing an array with a float raises TypeError on modern NumPy -- cast to int
    idx_to_reconstruct = int(np.round(idx_to_reconstruct))
    original = X[idx_to_reconstruct].reshape(1, -1)
    reconstructed_image = pca.inverse_transform(pca.transform(original))
    reconstructed_image_rpca = rpca.inverse_transform(rpca.transform(original))
    reconstructed_image_kpca = kpca.inverse_transform(kpca.transform(original))
    plt.figure(figsize=(15,7))
    plt.subplot(1,4,1)
    plt.imshow(X[idx_to_reconstruct].reshape((h, w)), cmap=plt.cm.gray)
    plt.title(names[y[idx_to_reconstruct]])
    plt.grid()
    plt.subplot(1,4,2)
    plt.imshow(reconstructed_image.reshape((h, w)), cmap=plt.cm.gray)
    plt.title('Full PCA')
    plt.grid()
    plt.subplot(1,4,3)
    plt.imshow(reconstructed_image_rpca.reshape((h, w)), cmap=plt.cm.gray)
    plt.title('Randomized PCA')
    plt.grid()
    plt.subplot(1,4,4)
    plt.imshow(reconstructed_image_kpca.reshape((h, w)), cmap=plt.cm.gray)
    plt.title('Kernel PCA')
    plt.grid()
widgets.interact(plt_reconstruct,idx_to_reconstruct=(0,n_samples-1,1),__manual=True)
Now let's look at other Methods for extracting features from images.
Let's start by calculating gradients:
from skimage.io import imshow
# pick a random face to visualize
idx_to_reconstruct = int(np.random.rand(1)*len(X))
img = X[idx_to_reconstruct].reshape((h,w))
imshow(img)
plt.grid()
from skimage.filters import sobel_h, sobel_v
# gradient magnitude: combine the horizontal and vertical Sobel responses
gradient_mag = np.sqrt(sobel_v(img)**2 + sobel_h(img)**2 )
imshow(gradient_mag)
plt.grid()
As discussed in lecture, DAISY features are a means of looking at histograms of edge weights using gradients at various orientations (if you are new to this, that is not a satisfying explanation; please look back at the notes/slides for the course).
http://scikit-image.org/docs/dev/_images/sphx_glr_plot_daisy_001.png
from skimage.feature import daisy
# lets first visualize what the daisy descriptor looks like
features, img_desc = daisy(img,step=40, radius=10, rings=3, histograms=5, orientations=8, visualize=True)
imshow(img_desc)
plt.grid()
# now let's understand how to use it
features = daisy(img,step=10, radius=10, rings=2, histograms=4, orientations=8, visualize=False)
# descriptors come back on a spatial grid: (grid rows, grid cols, descriptor length)
print(features.shape)
print(features.shape[0]*features.shape[1]*features.shape[2])
# helper: take one flattened image row and return its DAISY descriptors as a 1-D feature vector
def apply_daisy(row, shape):
    descriptor = daisy(row.reshape(shape), step=10, radius=10, rings=2,
                       histograms=6, orientations=8, visualize=False)
    # flatten the (rows, cols, length) descriptor grid into one vector
    return descriptor.reshape((-1))
# time a single-image extraction to estimate the full run below
%time test_feature = apply_daisy(X[3],(h,w))
test_feature.shape
0.027 * len(X) # approximate how long it may run
# apply to entire data, row by row,
# takes about a minute to run
%time daisy_features = np.apply_along_axis(apply_daisy, 1, X, (h,w))
print(daisy_features.shape)
from sklearn.metrics.pairwise import pairwise_distances
# find the pairwise distance between all the different image features
%time dist_matrix = pairwise_distances(daisy_features)
import copy
# find closest image to current image
idx1 = 5
# copy the row so the shared distance matrix is not modified in place
distances = copy.deepcopy(dist_matrix[idx1,:])
# BUG FIX: np.infty was removed in NumPy 2.0 -- use np.inf
distances[idx1] = np.inf # dont pick the same image!
idx2 = np.argmin(distances)
plt.figure(figsize=(7,10))
plt.subplot(1,2,1)
imshow(X[idx1].reshape((h,w)))
plt.title("Original Image")
plt.grid()
plt.subplot(1,2,2)
imshow(X[idx2].reshape((h,w)))
plt.title("Closest Image")
plt.grid()
from ipywidgets import fixed
# put it together inside a nice widget
def closest_image(dmat,idx1):
    """Display image idx1 with its two nearest neighbors under distance matrix dmat."""
    distances = copy.deepcopy(dmat[idx1,:]) # get all image distances
    # BUG FIX: np.infty was removed in NumPy 2.0 -- use np.inf to mask chosen entries
    distances[idx1] = np.inf # dont pick the same image!
    idx2 = np.argmin(distances)
    distances[idx2] = np.inf
    idx3 = np.argmin(distances)
    plt.figure(figsize=(10,16))
    plt.subplot(1,3,1)
    imshow(X[idx1].reshape((h,w)))
    plt.title("Original Image "+names[y[idx1]])
    plt.grid()
    plt.subplot(1,3,2)
    imshow(X[idx2].reshape((h,w)))
    plt.title("Closest Image "+names[y[idx2]])
    plt.grid()
    plt.subplot(1,3,3)
    imshow(X[idx3].reshape((h,w)))
    plt.title("Next Closest Image "+names[y[idx3]])
    plt.grid()
widgets.interact(closest_image,idx1=(0,n_samples-1,1),dmat=fixed(dist_matrix),__manual=True)
from skimage.filters import gabor_kernel
from scipy import ndimage as ndi
from scipy import stats
# prepare the filter bank: 4 orientations x 2 scales x 2 frequencies = 16 Gabor kernels
kernels = []
for step in range(4):
    orientation = step / 4. * np.pi
    for sigma in (1, 3):
        for frequency in (0.05, 0.25):
            kernels.append(np.real(gabor_kernel(frequency, theta=orientation,
                                                sigma_x=sigma, sigma_y=sigma)))
# compute the filter bank and take statistics of image
def compute_gabor(row, kernels, shape):
    """Filter one flattened image with every kernel and collect response statistics.

    Returns a flat vector of (mean, variance, skewness, kurtosis) per kernel.
    """
    img = row.reshape(shape)
    feats = np.zeros((len(kernels), 4), dtype=np.double)
    for k, kernel in enumerate(kernels):
        # wrap mode treats the image as periodic at the borders
        response = ndi.convolve(img, kernel, mode='wrap')
        desc = stats.describe(response.reshape(-1))
        feats[k, 0] = desc.mean
        feats[k, 1] = desc.variance
        feats[k, 2] = desc.skewness
        feats[k, 3] = desc.kurtosis
    return feats.reshape(-1)
# extract Gabor statistics for one random image as a sanity check
idx_to_reconstruct = int(np.random.rand(1)*len(X))
gabr_feature = compute_gabor(X[idx_to_reconstruct], kernels, (h,w))
gabr_feature
# takes ~3 minutes to run entire dataset
%time gabor_stats = np.apply_along_axis(compute_gabor, 1, X, kernels, (h,w))
print(gabor_stats.shape)
from sklearn.metrics.pairwise import pairwise_distances
# find the pairwise distance between all the different image features
%time dist_matrix_gabor = pairwise_distances(gabor_stats)
# browse nearest neighbors under the Gabor-statistics distance
widgets.interact(closest_image,idx1=(0,n_samples-1,1),dmat=fixed(dist_matrix_gabor),__manual=True)
Is gabor filtering a good feature extraction method for this dataset?